# -*-coding:utf-8-*-
__author__ = 'Administrator'
from bs4 import BeautifulSoup
import re
import urlparse

class HtmlParser(object):
    """Helpers for pulling information out of an already-parsed HTML page."""

    def _get_new_urls(self, page_url, soup):
        """Gather the absolute URLs of all ``/view/<digits>.htm`` links.

        :param page_url: URL of the page that was parsed; every matching
            href is resolved against it.
        :param soup: parsed document offering ``find_all`` (presumably a
            BeautifulSoup tree -- confirm against the caller).
        :return: set of resolved URL strings; repeated links collapse
            into a single entry.
        """
        # The pattern carries no end anchor, so whether hrefs ending in
        # ".html" are picked up too depends on how ``find_all`` applies it.
        entry_href = re.compile(r"/view/\d+\.htm")
        anchors = soup.find_all("a", href=entry_href)
        return set(
            urlparse.urljoin(page_url, anchor['href']) for anchor in anchors
        )